ephrin_binding.ipynb¶

Analyze receptor selection data using soluble bat Ephrin-B2 or -B3

  • Written by Brendan Larsen
In [1]:
# this cell is tagged as parameters for `papermill` parameterization
# input configs
altair_config = None
nipah_config = None

# input files
binding_E2_file = None
binding_E3_file = None

# output images
entry_binding_combined_corr_plot = None
entry_binding_combined_corr_plot_agg = None
E2_E3_correlation = None
E2_E3_correlation_site = None
binding_by_site_plot = None
entry_binding_corr_heatmap = None
binding_corr_heatmap = None
binding_region_bubble_plot = None
combined_contact_ranked_bar_output = None
In [2]:
# Parameters
nipah_config = "nipah_config.yaml"
altair_config = "data/custom_analyses_data/theme.py"
binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"
entry_binding_combined_corr_plot = (
    "results/images/entry_binding_combined_corr_plot.html"
)
entry_binding_combined_corr_plot_agg = (
    "results/images/entry_binding_combined_corr_plot_agg.html"
)
E2_E3_correlation = "results/images/E2_E3_correlation.html"
E2_E3_correlation_site = "results/images/E2_E3_correlation_site.html"
binding_by_site_plot = "results/images/binding_by_site_plot.html"
entry_binding_corr_heatmap = "results/images/entry_binding_corr_heatmap.html"
binding_corr_heatmap = "results/images/binding_corr_heatmap.html"
binding_region_bubble_plot = "results/images/binding_region_bubble_plot.html"
combined_contact_ranked_bar_output = (
    "results/images/combined_contact_ranked_bar_output.html"
)
In [3]:
import math
import os
import re
import altair as alt
import numpy as np
import pandas as pd
import scipy.stats
import yaml
In [4]:
# allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()

if (
    os.getcwd()
    == "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
):
    pass
    print("Already in correct directory")
else:
    os.chdir(
        "/fh/fast/bloom_j/computational_notebooks/blarsen/2023/Nipah_Malaysia_RBP_DMS/"
    )
    print("Setup in correct directory")
Setup in correct directory

hard paths for running in interactive mode¶

In [5]:
if nipah_config is None:
    ##hard paths in case don't want to run with snakemake
    print("loading hard paths")
    altair_config = "data/custom_analyses_data/theme.py"
    nipah_config = "nipah_config.yaml"

    # input files
    binding_E2_file = "results/filtered_data/binding/e2_binding_filtered.csv"
    binding_E3_file = "results/filtered_data/binding/e3_binding_filtered.csv"

Run config files to setup altair theme and config variables¶

In [6]:
if altair_config:
    with open(altair_config, "r") as file:
        exec(file.read())

with open(nipah_config) as f:
    config = yaml.safe_load(f)

Import the filtered binding and entry data for bEFNB2 and bEFNB3¶

In [7]:
# import binding data
df_E2_filter = pd.read_csv(binding_E2_file)
display(df_E2_filter.head(3))
df_E3_filter = pd.read_csv(binding_E3_file)
display(df_E3_filter.head(3))
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type
0 71 Q D -0.7817 1.6460 3.25 -1.164 0.8890 4.500 1.0 negative
1 71 Q E 0.1659 0.4309 3.50 -1.255 0.3123 5.375 1.0 negative
2 71 Q F -0.3429 0.5299 3.00 -1.058 0.6637 4.625 1.0 aromatic
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type
0 71 Q C -0.1332 0.1525 3.0 -0.7227 0.7828 3.000 1.0 special
1 71 Q D -0.2937 0.3305 4.0 -0.3884 0.6369 3.429 1.0 negative
2 71 Q E -0.3527 0.1762 4.0 -0.2482 0.9791 4.571 1.0 negative

Merge data¶

In [8]:
## Merge the filtered EFNB2 and EFNB3 DataFrames for combined analysis.
df_binding_effect_merge = pd.merge(
    df_E2_filter,
    df_E3_filter,
    on=["site", "wildtype", "mutant"],
    suffixes=["_E2", "_E3"],
    how="outer",
)
display(df_binding_effect_merge.head(3))
## Add a 'selection' column to distinguish between EFNB2 and EFNB3 data.
df_E2_filter["selection"] = "bEFNB2"
df_E3_filter["selection"] = "bEFNB3"

## Concatenate the DataFrames for plotting or further analysis.
df_binding_effect_concat = pd.concat([df_E2_filter, df_E3_filter])
display(df_binding_effect_concat.head(3))
site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 mutant_type_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3
0 71 Q D -0.7817 1.6460 3.25 -1.164 0.8890 4.500 1.0 negative -0.29370 0.3305 4.0 -0.3884 0.6369 3.429 1.0 negative
1 71 Q E 0.1659 0.4309 3.50 -1.255 0.3123 5.375 1.0 negative -0.35270 0.1762 4.0 -0.2482 0.9791 4.571 1.0 negative
2 71 Q F -0.3429 0.5299 3.00 -1.058 0.6637 4.625 1.0 aromatic -0.03982 0.1283 3.0 -0.4973 0.3080 3.286 1.0 aromatic
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
0 71 Q D -0.7817 1.6460 3.25 -1.164 0.8890 4.500 1.0 negative bEFNB2
1 71 Q E 0.1659 0.4309 3.50 -1.255 0.3123 5.375 1.0 negative bEFNB2
2 71 Q F -0.3429 0.5299 3.00 -1.058 0.6637 4.625 1.0 aromatic bEFNB2
In [9]:
# What are the top 5 highest and lowest binding mutants for EFNB2 and EFNB3?
def find_highest_lowest(df, name):
    print(f"We are analyzing {name}\n")
    print(f"The total number of mutants was: {df.shape[0]}\n")
    tmp_df = df.sort_values(by="binding_mean")
    print("These are the lowest binding mutants detected:")
    display(tmp_df.head(5))

    tmp_df_high = df.sort_values(by="binding_mean", ascending=False)
    print("\nThese are the highest binding mutants detected:\n")
    display(tmp_df_high.head(5))

    # What about mutants with positive entry scores?
    tmp_df = df[df["effect"] > 0].sort_values(by="binding_mean")
    print("These are the lowest binding mutants detected with positive entry scores:")
    display(tmp_df.head(5))

    tmp_df_high = df[df["effect"] > 0].sort_values(by="binding_mean", ascending=False)
    print(
        "\nThese are the highest binding mutants detected with positive entry scores:\n"
    )
    display(tmp_df_high.head(5))

    mean_df = df.groupby('site')['binding_mean'].sum().reset_index()
    display(mean_df.sort_values(by='binding_mean',ascending=False).head(10))


find_highest_lowest(df_E2_filter, "bEFNB2")
find_highest_lowest(df_E3_filter, "bEFNB3")
We are analyzing bEFNB2

The total number of mutants was: 6804

These are the lowest binding mutants detected:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
5484 501 E K -5.280 0.9284 24.50 -0.6440 0.6061 32.00 1.0 positive bEFNB2
5861 532 A V -5.259 1.1400 16.25 -0.3199 0.3122 17.88 1.0 hydrophobic bEFNB2
1817 236 R H -5.054 0.8828 22.00 -1.2530 0.3352 29.38 1.0 positive bEFNB2
5354 490 Q K -4.966 1.0810 8.50 -0.6916 0.5391 11.38 1.0 positive bEFNB2
5352 490 Q H -4.752 0.8619 8.25 -0.4499 0.6679 10.88 1.0 positive bEFNB2
These are the highest binding mutants detected:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6308 566 F C 2.160 2.1190 5.00 -0.5326 0.4547 5.500 0.75 special bEFNB2
6491 580 I S 2.152 1.7680 4.75 -1.0010 0.6921 6.375 1.00 hydrophilic bEFNB2
3054 331 N E 2.144 1.2330 3.00 -0.8872 0.8436 3.500 1.00 negative bEFNB2
2111 269 N E 2.075 2.2410 5.00 -1.0480 0.4835 7.875 1.00 negative bEFNB2
6596 586 N T 1.962 0.5779 5.25 -0.8163 0.8231 7.000 1.00 hydrophilic bEFNB2
These are the lowest binding mutants detected with positive entry scores:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6225 558 A V -4.647 0.7295 17.25 0.1182 0.3640 18.50 1.0 hydrophobic bEFNB2
6211 557 N D -4.627 0.3989 9.50 0.3848 0.4791 11.12 1.0 negative bEFNB2
5771 526 L H -4.546 0.7822 5.25 0.1102 0.3920 6.75 1.0 positive bEFNB2
6510 581 Y W -4.519 1.0130 10.75 0.0907 0.4101 11.50 1.0 aromatic bEFNB2
5367 491 S H -4.505 0.9350 8.50 0.1124 0.2502 10.25 1.0 positive bEFNB2
These are the highest binding mutants detected with positive entry scores:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
2334 282 C S 1.609 1.056 4.50 0.3378 0.3251 3.714 0.50 hydrophilic bEFNB2
4833 450 Q I 1.457 1.194 5.00 0.1418 0.3988 6.000 1.00 hydrophobic bEFNB2
3193 339 S A 1.389 2.087 2.00 0.4493 0.1488 2.625 0.75 hydrophobic bEFNB2
3522 361 T A 1.371 2.108 3.50 0.3626 0.4499 4.875 1.00 hydrophobic bEFNB2
817 164 N S 1.297 1.136 3.75 0.2398 0.2906 6.125 1.00 hydrophilic bEFNB2
site binding_mean
250 331 15.73280
473 564 15.08690
97 172 13.88370
225 306 13.85177
493 586 13.36944
103 178 9.99930
151 227 9.97030
90 165 8.74900
239 320 8.57420
496 589 8.56552
We are analyzing bEFNB3

The total number of mutants was: 6618

These are the lowest binding mutants detected:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6124 555 D K -2.147 1.2250 6.0 0.08504 0.4221 6.714 1.0 positive bEFNB3
6127 555 D R -1.510 0.4205 4.0 0.07591 0.5260 4.143 1.0 positive bEFNB3
5810 530 Q L -1.447 0.4772 3.5 -1.01900 0.8623 5.857 1.0 hydrophobic bEFNB3
6413 583 T R -1.410 0.4465 5.5 -1.00500 0.4854 6.286 1.0 positive bEFNB3
6410 583 T K -1.247 0.1132 5.0 -0.72780 0.6441 7.000 1.0 positive bEFNB3
These are the highest binding mutants detected:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6460 589 R G 2.006 0.65910 7.0 -0.9655 0.5763 8.286 1.0 special bEFNB3
6379 580 I M 1.337 0.53380 5.5 -0.8155 0.5307 6.714 1.0 hydrophobic bEFNB3
2361 270 V Q 1.329 1.02800 5.0 -0.8846 0.6509 5.429 1.0 hydrophilic bEFNB3
2313 267 M Q 1.291 1.37100 4.5 -0.7981 0.8175 5.667 1.0 hydrophilic bEFNB3
5462 492 Q L 1.261 0.06666 5.5 0.4833 0.1827 5.143 1.0 hydrophobic bEFNB3
These are the lowest binding mutants detected with positive entry scores:
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
6124 555 D K -2.1470 1.2250 6.0 0.08504 0.42210 6.714 1.0 positive bEFNB3
6127 555 D R -1.5100 0.4205 4.0 0.07591 0.52600 4.143 1.0 positive bEFNB3
5813 530 Q W -0.9974 0.5473 3.0 0.01818 0.66470 3.286 1.0 aromatic bEFNB3
6118 555 D A -0.8295 0.1999 3.5 0.37550 0.23450 3.714 1.0 hydrophobic bEFNB3
4852 441 P V -0.7461 0.1815 5.5 0.37080 0.08612 5.286 1.0 hydrophobic bEFNB3
These are the highest binding mutants detected with positive entry scores:

site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
5462 492 Q L 1.2610 0.06666 5.5 0.48330 0.1827 5.143 1.0 hydrophobic bEFNB3
1708 211 G F 1.1690 0.28900 5.0 0.16370 0.5716 5.714 1.0 aromatic bEFNB3
6543 598 P G 1.1540 1.55900 4.5 0.23870 0.5146 5.429 1.0 special bEFNB3
6098 553 S W 0.9943 0.43520 9.5 0.16690 0.4060 9.857 1.0 aromatic bEFNB3
6402 582 D W 0.9761 0.33840 6.5 0.06342 0.3096 7.143 1.0 aromatic bEFNB3
site binding_mean
490 589 11.707770
87 161 7.332800
469 564 6.721080
225 306 6.492740
59 132 6.168804
242 323 6.101950
470 566 5.968996
261 342 5.728920
129 204 5.653570
175 254 5.152500

Find the top overlapping binders for both bEFNB2 and bEFNB3¶

In [10]:
def find_good_binding_for_both(df):
    e2_good = df.sort_values(by='binding_mean_E2',ascending=False)
    e3_good = df.sort_values(by='binding_mean_E3',ascending=False)
    e2_good = e2_good.head(50)
    e3_good = e3_good.head(50)
    combo = pd.merge(e2_good,e3_good,on=['site','wildtype','mutant'],how='inner')
    display(combo)

find_good_binding_for_both(df_binding_effect_merge)
site wildtype mutant binding_mean_E2_x binding_std_E2_x times_seen_binding_E2_x effect_E2_x effect_std_E2_x times_seen_cell_entry_E2_x frac_models_E2_x ... frac_models_E2_y mutant_type_E2_y binding_mean_E3_y binding_std_E3_y times_seen_binding_E3_y effect_E3_y effect_std_E3_y times_seen_cell_entry_E3_y frac_models_E3_y mutant_type_E3_y
0 566 F C 2.160 2.1190 5.000 -0.5326 0.4547 5.500 0.75 ... 0.75 special 1.1320 1.194000 3.5 -0.8507 0.7599 3.833 1.0 special
1 580 I S 2.152 1.7680 4.750 -1.0010 0.6921 6.375 1.00 ... 1.00 hydrophilic 0.8187 0.030890 6.0 0.1163 0.3235 5.571 1.0 hydrophilic
2 211 G F 1.957 0.5488 4.750 -0.6654 0.4262 5.125 1.00 ... 1.00 aromatic 1.1690 0.289000 5.0 0.1637 0.5716 5.714 1.0 aromatic
3 553 S W 1.945 0.2867 10.000 -0.2967 0.4293 13.250 1.00 ... 1.00 aromatic 0.9943 0.435200 9.5 0.1669 0.4060 9.857 1.0 aromatic
4 139 N W 1.930 0.4834 5.750 -0.6672 0.6217 8.000 1.00 ... 1.00 aromatic 1.0030 0.004575 4.5 -0.4441 0.4452 6.286 1.0 aromatic
5 589 R G 1.668 0.9018 7.333 -1.3800 0.4560 11.880 0.75 ... 0.75 special 2.0060 0.659100 7.0 -0.9655 0.5763 8.286 1.0 special

6 rows × 35 columns

In [11]:
# Compare E2 and E3 binders
def find_highest_lowest(df):
    df["binding_diff"] = (df["binding_mean_E2"] - df["binding_mean_E3"]).abs()
    print(
        "These are the mutants with the biggest difference between EFNB2 and EFNB3:\n"
    )
    display(df.sort_values(by="binding_diff", ascending=False).head(10))

    # calculate aggregate differences
    agg_df = (
        df.groupby("site")[["binding_mean_E2", "binding_mean_E3", "binding_diff"]]
        .mean()
        .reset_index()
    )
    print("These are the sites with the biggest difference between EFNB2 and EFNB3:\n")
    display(agg_df.sort_values(by="binding_diff", ascending=False).head(5))


find_highest_lowest(df_binding_effect_merge)
# find_highest_lowest(df_E3_filter,'EFNB3')
These are the mutants with the biggest difference between EFNB2 and EFNB3:

site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 mutant_type_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3 binding_diff
5835 530 Q F -4.015 0.46650 6.25 0.46740 0.2023 5.375 1.0 aromatic 0.084370 0.06198 6.0 0.3592 0.2738 6.286 1.0 aromatic 4.099370
5362 490 Q Y -4.290 0.90020 5.50 -1.08800 0.4004 6.375 1.0 aromatic -0.461000 0.57400 5.5 -0.3137 0.7967 5.714 1.0 aromatic 3.829000
5367 491 S H -4.505 0.93500 8.50 0.11240 0.2502 10.250 1.0 positive -0.871800 0.78980 7.0 -1.3810 0.4865 8.286 1.0 positive 3.633200
5366 491 S G -4.181 0.59490 6.50 0.13020 0.2975 8.625 1.0 special -0.562100 0.04726 5.5 -0.2358 0.6084 6.286 1.0 special 3.618900
5361 490 Q W -4.205 1.00500 3.75 0.23460 0.3779 4.750 1.0 aromatic -0.586500 0.32260 3.0 -0.1506 0.4575 3.286 1.0 aromatic 3.618500
6627 588 I P -2.349 0.84050 5.25 -0.25550 0.3004 4.750 1.0 special 1.183000 0.25530 5.5 -0.6131 0.3819 5.429 1.0 special 3.532000
5385 492 Q K -3.523 0.54860 11.00 0.35270 0.2320 11.620 1.0 positive 0.006126 0.07740 10.0 0.3291 0.2404 11.570 1.0 positive 3.529126
5389 492 Q R -2.823 0.61110 19.25 -0.07662 0.2715 23.120 1.0 positive 0.643700 0.05388 20.0 0.5095 0.1192 20.570 1.0 positive 3.466700
1864 239 S H -3.440 0.08816 5.25 0.33410 0.2721 4.500 1.0 positive -0.041160 0.83220 4.0 -1.2190 0.6930 5.000 1.0 positive 3.398840
5844 530 Q V -4.327 0.83770 6.00 0.10570 0.2693 8.625 1.0 hydrophobic -0.987000 0.89600 5.5 -1.4430 0.7389 7.000 1.0 hydrophobic 3.340000
These are the sites with the biggest difference between EFNB2 and EFNB3:

site binding_mean_E2 binding_mean_E3 binding_diff
416 505 -3.790625 -0.770900 2.897100
440 530 -3.631593 -0.559266 2.797646
406 491 -3.575407 -0.364667 2.785117
496 588 -3.474250 0.037600 2.686600
408 494 -3.915765 -0.615000 2.411333

Make plots showing correlation between binding and entry for EFNB2 and EFNB3¶

In [12]:
def plot_corr_binding_entry_updated(df, flag):
    # Define interactive selectors for variant selection.
    # 'variant_selector' is for selecting individual variants based on 'site' and 'mutant'.
    variant_selector = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site", "mutant"], value=0
    )
    # 'variant_selector_agg' is for aggregated selection based on 'site' only.
    variant_selector_agg = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site"], value=0
    )

    # Initialize an empty list to store charts for each unique selection in 'df'.
    empty_chart = []
    
    # Loop through each unique cell selection in the DataFrame.
    for cell in list(df["selection"].unique()):
        # Filter DataFrame for the current selection.
        tmp_df = df[df["selection"] == cell]
        
        # Check if aggregation flag is True to aggregate data.
        if flag == True:
            # Aggregate data by 'site', summing 'binding_median' and 'effect', then reset index.
            agg_df = (
                tmp_df.groupby("site")[["binding_mean", "effect"]].mean().reset_index()
            )
            # Create a chart using aggregated data with specified encodings.
            chart = (
                alt.Chart(agg_df)
                .mark_point(stroke="black", filled=True)  # Use filled points with black stroke.
                .encode(
                    x=alt.X(
                        "effect",
                        title=f"Mean entry in CHO-{cell}",
                        axis=alt.Axis(grid=True),
                    ),
                    y=alt.Y(
                        "binding_mean",
                        title=f"Mean {cell} binding",
                        axis=alt.Axis(grid=True),
                    ),
                    # Dynamic opacity, size, strokeWidth, and color based on 'variant_selector_agg'.
                    opacity=alt.condition(
                        variant_selector_agg, alt.value(1), alt.value(0.2)
                    ),
                    size=alt.condition(
                        variant_selector_agg, alt.value(100), alt.value(50)
                    ),
                    strokeWidth=alt.condition(
                        variant_selector_agg, alt.value(1), alt.value(0)
                    ),
                    color=alt.condition(
                        variant_selector_agg, alt.value("orange"), alt.value("black")
                    ),
                    tooltip=["site", "binding_mean", "effect"],
                )
                .add_params(variant_selector_agg)
            )
            # Add the chart to the list.
            empty_chart.append(chart)

        else:
            # Create a chart for individual data points with specified encodings.
            chart = (
                alt.Chart(tmp_df)
                .mark_point(stroke="black", filled=True)
                .encode(
                    x=alt.X(
                        "effect", title=f"Entry in CHO-{cell}", axis=alt.Axis(grid=True)
                    ),
                    y=alt.Y(
                        "binding_mean",
                        title=f"{cell} binding",
                        axis=alt.Axis(grid=True),
                    ),
                    # Dynamic opacity, size, strokeWidth, and color based on 'variant_selector'.
                    opacity=alt.condition(
                        variant_selector, alt.value(1), alt.value(0.1)
                    ),
                    size=alt.condition(variant_selector, alt.value(50), alt.value(20)),
                    strokeWidth=alt.condition(
                        variant_selector, alt.value(1), alt.value(0)
                    ),
                    color=alt.condition(
                        variant_selector, alt.value("orange"), alt.value("black")
                    ),
                    tooltip=[
                        "site",
                        "wildtype",
                        "mutant",
                        "binding_mean",
                        "times_seen_binding",
                        "effect",
                    ],
                )
                .add_params(variant_selector)
            )
            # Add the chart to the list.
            empty_chart.append(chart)

    # Combine all charts horizontally with a title.
    combined_chart = alt.hconcat(
        *empty_chart, title=alt.Title("Correlation between binding and entry")
    )
    
    # Return the combined chart for display or further use.
    return combined_chart

# Generate and display plots for non-aggregated data.
entry_binding_corr_plot = plot_corr_binding_entry_updated(df_binding_effect_concat, False)
entry_binding_corr_plot.display()

# Save the plot 
if entry_binding_combined_corr_plot is not None:
    entry_binding_corr_plot.save(entry_binding_combined_corr_plot)

# Repeat for aggregated data.
entry_binding_corr_plot_agg = plot_corr_binding_entry_updated(df_binding_effect_concat, True)
entry_binding_corr_plot_agg.display()

# Save the aggregated plot
if entry_binding_combined_corr_plot is not None:
    entry_binding_corr_plot_agg.save(entry_binding_combined_corr_plot_agg)

Same plot as above, but slightly different format¶

In [13]:
def plot_entry_binding_corr_heatmap(df):
    empty_chart = []

    for cell in list(df["selection"].unique()):
        tmp_df = df[df["selection"] == cell]
        chart = (
            alt.Chart(tmp_df, title=f"{cell}")
            .mark_rect()
            .encode(
                x=alt.X(
                    "effect", title="Cell entry", axis=alt.Axis(values=[-2, -1, 0, 1])
                ).bin(maxbins=50),
                y=alt.Y(
                    "binding_mean",
                    title="Receptor binding",
                    axis=alt.Axis(values=[-4, -2, 0, 2]),
                ).bin(maxbins=50),
                color=alt.Color("count()", title="Count").scale(type='log'),
            )
        ).properties(width=200,height=200)
        empty_chart.append(chart)

    combined_chart = alt.hconcat(
        *empty_chart, 
    ).resolve_scale(y="shared", x="shared", color="shared")
    return combined_chart


entry_binding_corr_heat = plot_entry_binding_corr_heatmap(df_binding_effect_concat)
entry_binding_corr_heat.display()
if entry_binding_combined_corr_plot is not None:
    entry_binding_corr_heat.save(entry_binding_corr_heatmap)

Calculate some stats on binding¶

In [14]:
def overall_stats(df, effect, name):
    # Now group sites and find sites where all mutants are deleterious
    filtered_df = df.groupby("site").filter(lambda group: (group[effect] < -0.25).all())
    # Which sites are these?
    unique = filtered_df["site"].unique()
    # Convert unique to a Pandas Series
    unique_series = pd.Series(unique)

    # Find the common elements that are also contact sites
    unique_contact_bool = unique_series.isin(config["contact_sites"])
    # Filter and get the common elements
    common_elements = unique_series[unique_contact_bool]

    print(f"The dataset we are analyzing is: {name}\n")

    # Print the common elements
    print(
        f"Here are the contact sites that only have negative binding scores: {list(common_elements)}\n"
    )

    print(f"There are {len(unique)} sites with all negative binding score mutants\n")
    print(
        f"These are the sites with all negative binding score mutants: {list(unique)}\n"
    )

    # Now find sites with low and high binding (median)
    median_df = (
        df.groupby("site")["binding_mean"]
        .max()
        .reset_index()
        .sort_values(by="binding_mean", ascending=False)
    )
    print("These are the sites with the highest binding mutants:\n")
    display(median_df.head(5))

    # Now calculate mutant number
    total_mutants = df.shape[0]

    mutants_above_cutoff_tolerated = df[df["effect"] > 0]
    mutants_above_cutoff_tolerated = mutants_above_cutoff_tolerated[
        ["site", "effect", "binding_mean", "wildtype", "mutant"]
    ]

    total_sites = df["site"].unique().shape[0]

    print(f"The total number of sites are: {total_sites}")


overall_stats(df_E2_filter, "binding_mean", "EFNB2")
overall_stats(df_E3_filter, "binding_mean", "EFNB3")
The dataset we are analyzing is: EFNB2

Here are the contact sites that only have negative binding scores: [242, 389, 488, 490, 491, 501, 504, 505, 531, 532, 533, 557, 579, 581, 588]

There are 41 sites with all negative binding score mutants

These are the sites with all negative binding score mutants: [100, 116, 220, 236, 238, 242, 243, 248, 346, 351, 352, 389, 390, 400, 435, 438, 441, 460, 467, 486, 487, 488, 490, 491, 494, 495, 497, 501, 504, 505, 526, 531, 532, 533, 557, 579, 581, 584, 585, 588, 590]

These are the sites with the highest binding mutants:

site binding_mean
474 566 2.160
487 580 2.152
250 331 2.144
188 269 2.075
493 586 1.962
The total number of sites are: 510
The dataset we are analyzing is: EFNB3

Here are the contact sites that only have negative binding scores: [389, 488, 501, 505, 531, 532]

There are 15 sites with all negative binding score mutants

These are the sites with all negative binding score mutants: [108, 352, 389, 467, 486, 488, 494, 495, 497, 501, 505, 510, 531, 532, 584]

These are the sites with the highest binding mutants:

site binding_mean
490 589 2.006
482 580 1.337
189 270 1.329
186 267 1.291
405 492 1.261
The total number of sites are: 504

Find sites with opposite effects on binding¶

In [15]:
# find sites that are different
def find_biggest_differences(df):
    efnb2_good_efnb3_bad = df[
        (df["binding_mean_E2"] > 0.5) & (df["binding_mean_E3"] < -0.5)
    ].sort_values(by="binding_mean_E2", ascending=False)
    print('mutantions good for efnb2 binding, bad for efnb3 binding:\n')
    display(efnb2_good_efnb3_bad)

    efnb2_bad_efnb3_good = df[
        (df["binding_mean_E2"] < -0.5) & (df["binding_mean_E3"] > 0.5)
    ].sort_values(by="binding_mean_E3", ascending=False)
    print('mutantions bad for efnb2 binding, good for efnb3 binding:\n')
    display(efnb2_bad_efnb3_good)


find_biggest_differences(df_binding_effect_merge)
mutantions good for efnb2 binding, bad for efnb3 binding:

site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 mutant_type_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3 binding_diff
6041 546 L H 1.5230 1.1450 5.25 -1.1480 0.3866 6.750 1.0 positive -0.5431 0.6654 3.5 -0.85080 0.5440 5.143 1.0 positive 2.0661
2620 303 P C 1.3450 1.5770 5.00 -0.7175 0.5900 5.000 1.0 special -0.5659 0.0975 4.0 -0.69680 0.5942 4.857 1.0 special 1.9109
122 79 N S 1.2920 1.6520 3.00 -0.6352 0.4964 2.375 1.0 hydrophilic -0.6464 0.2595 3.0 -0.12940 0.5688 3.429 1.0 hydrophilic 1.9384
103 78 D I 0.6586 0.6340 5.25 -0.9306 0.3975 6.500 1.0 hydrophobic -0.5665 0.7552 5.0 -0.51670 0.8447 5.143 1.0 hydrophobic 1.2251
1681 224 M Q 0.6473 0.8796 3.50 0.1659 0.3148 2.750 1.0 hydrophilic -0.5806 0.3813 3.5 0.20550 0.2920 4.286 1.0 hydrophilic 1.2279
5707 520 I G 0.5914 1.0130 5.00 -1.4650 0.3487 8.000 1.0 special -0.5305 0.1476 5.5 -1.30900 0.3187 7.000 1.0 special 1.1219
1031 178 V T 0.5288 0.7257 3.75 -0.5276 0.7591 4.250 1.0 hydrophilic -0.5130 0.5995 3.0 -0.01921 0.4254 3.571 1.0 hydrophilic 1.0418
mutantions bad for efnb2 binding, good for efnb3 binding:

site wildtype mutant binding_mean_E2 binding_std_E2 times_seen_binding_E2 effect_E2 effect_std_E2 times_seen_cell_entry_E2 frac_models_E2 mutant_type_E2 binding_mean_E3 binding_std_E3 times_seen_binding_E3 effect_E3 effect_std_E3 times_seen_cell_entry_E3 frac_models_E3 mutant_type_E3 binding_diff
6627 588 I P -2.3490 0.8405 5.25 -0.25550 0.3004 4.750 1.0 special 1.1830 0.255300 5.5 -0.6131 0.3819 5.429 1.0 special 3.5320
6733 598 P G -0.5523 0.7852 5.00 -0.86740 0.9393 4.625 1.0 special 1.1540 1.559000 4.5 0.2387 0.5146 5.429 1.0 special 1.7063
5236 479 W S -0.8017 1.2720 3.25 -1.30400 0.8784 5.250 1.0 hydrophilic 0.7307 2.957000 2.0 -1.4690 0.6120 3.286 1.0 hydrophilic 1.5324
3179 338 R G -0.5172 0.5068 4.50 0.29960 0.4223 5.250 1.0 special 0.6836 0.975700 4.0 0.1580 0.3395 3.429 1.0 special 1.2008
5363 491 S A -0.9317 0.5371 5.50 -0.66860 0.8056 6.250 1.0 hydrophobic 0.6451 0.009846 6.5 0.1429 0.3582 6.143 1.0 hydrophobic 1.5768
5389 492 Q R -2.8230 0.6111 19.25 -0.07662 0.2715 23.120 1.0 positive 0.6437 0.053880 20.0 0.5095 0.1192 20.570 1.0 positive 3.4667

Find correlations between bEFNB2 and bEFNB3 binding¶

In [16]:
def plot_entry_binding_corr(df):
    chart = (
        alt.Chart(df, title="Correlation Between Mutant Binding Scores")
        .mark_rect()
        .encode(
            x=alt.X(
                "binding_mean_E2",
                title="bEFNB2 binding",
                axis=alt.Axis(values=[-4,-2, 0, 2]),
            ).bin(maxbins=50),
            y=alt.Y(
                "binding_mean_E3",
                title="bEFNB3 binding",
                axis=alt.Axis(values=[-2, 0, 2]),
            ).bin(maxbins=50),
            color=alt.Color("count()", title="Count").scale(type='log'),
        )
    ).properties(width=200,height=200)
    return chart


entry_binding_corr_heatmap_1 = plot_entry_binding_corr(df_binding_effect_merge)
entry_binding_corr_heatmap_1.display()
if entry_binding_combined_corr_plot is not None:
    entry_binding_corr_heatmap_1.save(binding_corr_heatmap)

Plot correlations of binding mutants in scatterplots, and color different subsets¶

In [17]:
def plot_affinity_BLI_mutants(df, highlight_conditions, color,subset=None):
    df = df.dropna().round(2).copy()
    df['site_mutant'] = df['site'].astype(str) + df['mutant'].astype(str)

    if subset is not None:
        df = df[df['site'].isin(subset)]

    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        df["binding_mean_E2"], df["binding_mean_E3"]
    )
    
    # make correlation chart
    chart = alt.Chart(df,title=alt.Title("Correlation Between Mutant Binding Scores")
    ).mark_point(
        color="gray", 
        size=30, 
        opacity=0.4, 
        filled=True
    ).encode(
        x=alt.X("binding_mean_E2", title=("bEFNB2 binding"),axis=alt.Axis(tickCount=4)),
        y=alt.Y("binding_mean_E3", title=("bEFNB3 binding"),axis=alt.Axis(tickCount=4)),
        tooltip=[
            "site",
            "wildtype",
            "mutant",
            "binding_mean_E2",
            "binding_mean_E3",
            "effect_E2",
            "effect_E3",
        ],
    )

    #make orange circles for specific data
    highlight = chart.transform_filter(highlight_conditions).mark_point(
        color=color, size=40, opacity=1, filled=True, stroke='black', strokeWidth=1
    )
    #write text near the orange circles
    text_on_point = chart.transform_filter(highlight_conditions).mark_text(
            align='center',baseline='top',dy=-15,fontSize=14
    ).encode(text='site_mutant')
    
    min = int(df["binding_mean_E2"].min())
    max = int(df["binding_mean_E3"].max())
    text = (
        alt.Chart({"values": [{"x": min, "y": max, "text": f"r = {r_value:.2f}"}]})
        .mark_text(align="left", baseline="top", dx=-10, dy=-20)
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    chart_and_text = chart + text_on_point + highlight #+ text
    return chart_and_text.properties(height=300,width=300)

Sites selected for BLI validation¶

In [18]:
# these are the data we want to show in orange circles
highlight_conditions = (
        (alt.datum.site == 244) & (alt.datum.mutant == 'W') |
        (alt.datum.site == 305) & (alt.datum.mutant == 'W') | 
        (alt.datum.site == 492) & (alt.datum.mutant == 'L') |
        (alt.datum.site == 507) & (alt.datum.mutant == 'I') |
        (alt.datum.site == 530) & (alt.datum.mutant == 'F') |
        (alt.datum.site == 553) & (alt.datum.mutant == 'W') |
        (alt.datum.site == 555) & (alt.datum.mutant == 'Y') |
        (alt.datum.site == 559) & (alt.datum.mutant == 'R') |
        (alt.datum.site == 588) & (alt.datum.mutant == 'V') 
)

E2_E3_corr = plot_affinity_BLI_mutants(df_binding_effect_merge,highlight_conditions,'#e49444')
E2_E3_corr.display()
if entry_binding_combined_corr_plot is not None:
    E2_E3_corr.save(E2_E3_correlation)

Sites selected for neutralization validations¶

In [19]:
highlight_conditions_neut = (
        (alt.datum.site == 333) & (alt.datum.mutant == 'Q') |
        (alt.datum.site == 492) & (alt.datum.mutant == 'R') | 
        (alt.datum.site == 507) & (alt.datum.mutant == 'I') |
        (alt.datum.site == 530) & (alt.datum.mutant == 'F') |
        (alt.datum.site == 553) & (alt.datum.mutant == 'W') |
        (alt.datum.site == 555) & (alt.datum.mutant == 'K') 
)
neut_muts = plot_affinity_BLI_mutants(df_binding_effect_merge,highlight_conditions_neut,'#5778a4')
neut_muts.display()

Top sites different in bEFNB2/3¶

In [20]:
highlight_conditions_top = (
        (alt.datum.site == 580) & (alt.datum.mutant == 'S') |
        (alt.datum.site == 598) & (alt.datum.mutant == 'G') | 
        (alt.datum.site == 492) & (alt.datum.mutant == 'L') |
        #(alt.datum.site == 553) & (alt.datum.mutant == 'W') |
        (alt.datum.site == 588) & (alt.datum.mutant == 'P') |
        (alt.datum.site == 492) & (alt.datum.mutant == 'R') |
        (alt.datum.site == 530) & (alt.datum.mutant == 'F') |
        (alt.datum.site == 492) & (alt.datum.mutant == 'K') |
        (alt.datum.site == 239) & (alt.datum.mutant == 'H') |
        #(alt.datum.site == 211) & (alt.datum.mutant == 'F') |
        (alt.datum.site == 546) & (alt.datum.mutant == 'H') |
        (alt.datum.site == 143) & (alt.datum.mutant == 'Q') |
        (alt.datum.site == 331) & (alt.datum.mutant == 'E') |
        (alt.datum.site == 589) & (alt.datum.mutant == 'G') 
)
top_muts = plot_affinity_BLI_mutants(df_binding_effect_merge,highlight_conditions_top,'#d1615d')
top_muts.display()

Same as above, top sites different in bEFNB2/3 but only for contact sites¶

In [21]:
highlight_conditions_top_contact = (
        (alt.datum.site == 580) & (alt.datum.mutant == 'S') |
        #(alt.datum.site == 458) & (alt.datum.mutant == 'Y') | 
        (alt.datum.site == 492) & (alt.datum.mutant == 'L') |
        (alt.datum.site == 491) & (alt.datum.mutant == 'A') |
        (alt.datum.site == 588) & (alt.datum.mutant == 'P') |
        (alt.datum.site == 492) & (alt.datum.mutant == 'R') |
        (alt.datum.site == 530) & (alt.datum.mutant == 'F') |
        (alt.datum.site == 492) & (alt.datum.mutant == 'K') |
        (alt.datum.site == 239) & (alt.datum.mutant == 'H') 
)
top_muts_contact = plot_affinity_BLI_mutants(df_binding_effect_merge,highlight_conditions_top_contact,'yellow',subset = config['contact_sites'])
top_muts_contact.display()

Plot correlations of mutants for individual sites of interest with letters of each mutant plotted¶

In [22]:
def plot_affinity_individual_mutants(df,mutant):
    df = df.dropna().round(2).copy()
    df = df[df['site'] == mutant]
    if df.empty:
        print('nothing here')
        pass
    else:
        wildtype_site = (df['wildtype'].astype(str) + df['site'].astype(str)).unique()[0]

        chart = alt.Chart(df,title=alt.Title(f"Site {wildtype_site}")
        ).mark_text(
            size=15
        ).encode(
            x=alt.X("binding_mean_E2", title=("bEFNB2 binding"),axis=alt.Axis(tickCount=3)),
            y=alt.Y("binding_mean_E3", title=("bEFNB3 binding"),axis=alt.Axis(tickCount=3)),
            text=alt.Text('mutant'),
            color=alt.Color('mutant_type_E2',title='Amino acid type'),
            tooltip=[
                "site",
                "wildtype",
                "mutant",
                "binding_mean_E2",
                "binding_mean_E3",
                "effect_E2",
                "effect_E3",
            ],
        )
    
        return chart.properties(height=200,width=200)
In [23]:
indiv_mutant_589 = plot_affinity_individual_mutants(df_binding_effect_merge,589)
indiv_mutant_589.display()
In [24]:
indiv_mutant_553 = plot_affinity_individual_mutants(df_binding_effect_merge,553)
indiv_mutant_553.display()
In [25]:
indiv_mutant_580 = plot_affinity_individual_mutants(df_binding_effect_merge,580)
indiv_mutant_582 = plot_affinity_individual_mutants(df_binding_effect_merge,582)
indiv_mutant_586 = plot_affinity_individual_mutants(df_binding_effect_merge,586)
indiv_mutant_587 = plot_affinity_individual_mutants(df_binding_effect_merge,587)
indiv_mutant_589 = plot_affinity_individual_mutants(df_binding_effect_merge,589)
loop580_590 = alt.vconcat(indiv_mutant_580, indiv_mutant_582,indiv_mutant_586,indiv_mutant_587,indiv_mutant_589).resolve_scale(x='shared',y='shared',color='shared')
loop580_590.display()
In [26]:
indiv_mutant_172 = plot_affinity_individual_mutants(df_binding_effect_merge,172)
indiv_mutant_172.display()

Now make for all contact sites¶

In [27]:
empty_charts = []
for site in config['contact_sites']:
    tmp_df = df_binding_effect_merge[df_binding_effect_merge['site'] == site]
    non_nan_in_effect_E2 = tmp_df['binding_mean_E2'].notnull().any()
    non_nan_in_effect_E3 = tmp_df['binding_mean_E3'].notnull().any()

    if non_nan_in_effect_E2 and non_nan_in_effect_E3:        
        contact_plots = plot_affinity_individual_mutants(tmp_df,site)
        empty_charts.append(contact_plots)
    else:
        pass

all_contact_plots = alt.vconcat(*empty_charts).resolve_scale(x='shared', y='shared')
all_contact_plots.display()
In [28]:
for site in list(range(580,590)):
    tmp_df = df_binding_effect_merge[df_binding_effect_merge['site'] == site]
    non_nan_in_effect_E2 = tmp_df['binding_mean_E2'].notnull().any()
    non_nan_in_effect_E3 = tmp_df['binding_mean_E3'].notnull().any()

    if non_nan_in_effect_E2 and non_nan_in_effect_E3:        
        test_plots = plot_affinity_individual_mutants(tmp_df,site)
        test_plots.display()
    else:
        pass

Plot correlations between each site for mean value¶

In [29]:
def plot_affinity_solid_mean(df):
    df = df.dropna()
    means = (
        df.groupby("site")
        .agg(
            {
                "effect_E2": "mean",
                "effect_E3": "mean",
                "binding_mean_E2": "mean",
                "binding_mean_E3": "mean",
                "wildtype": "first",
            }
        )
        .reset_index().round(2)
    )
    slope, intercept, r_value, p_value, std_err = scipy.stats.linregress(
        means["binding_mean_E2"], means["binding_mean_E3"]
    )
    r_value = float(r_value)
    chart = (
        alt.Chart(
            means,
            title=alt.Title(
                "Correlation between Aggregate Mutant Binding Scores",
                subtitle=f"r={r_value:.2f}",
            ),
        )
        .mark_point(size=50, opacity=1,filled=True,color='black')
        .encode(
            x=alt.X(
                "binding_mean_E2",
                title=("Mean bEFNB2 binding"),
                axis=alt.Axis(tickCount=3),
            ),
            y=alt.Y(
                "binding_mean_E3",
                title=("Mean bEFNB3 binding"),
                axis=alt.Axis(tickCount=3),
            ),
            tooltip=[
                "site",
                "wildtype",
                "binding_mean_E2",
                "binding_mean_E3",
                "effect_E2",
                "effect_E3",
            ],
        )
    )
    text = (
        alt.Chart({"values": [{"x": -3.5, "y": 0.5, "text": f"r = {r_value:.2f}"}]})
        .mark_text(align="left", baseline="top", dx=0, dy=0)
        .encode(x=alt.X("x:Q"), y=alt.Y("y:Q"), text="text:N")
    )
    chart_and_text = chart.properties(width=300,height=300)
    return chart_and_text


E2_E3_site_corr = plot_affinity_solid_mean(df_binding_effect_merge)
E2_E3_site_corr.display()
if entry_binding_combined_corr_plot is not None:
    E2_E3_site_corr.save(E2_E3_correlation_site)

Make plot showing binding by site¶

In [30]:
def plot_affinity_by_site(df):
     # define ranges of different RBP regions
    barrel_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        "Head": list(range(178, 603)),
    }
    
    custom_order = ["Stalk", "Neck", "Linker", "Head"] #custom order for color legend
    
    agg_means = [] #store aggregation 
    
    # For each barrel, filter the dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = df[df["site"].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append(
                {"region": barrel, "binding_mean": row["binding_mean"], "site": row["site"],"selection": row["selection"]}
            )
        agg_means_df = pd.DataFrame(agg_means).round(2)
    
    # Setup interactivity
    variant_selector = alt.selection_point(
        on="mouseover", nearest=True, empty=False, fields=["site"], value=0
    )
    #make chart
    chart = alt.Chart(agg_means_df).mark_bar(stroke='black',size=1.5,binSpacing=0,color='black').encode(
        alt.X("site")
            .title("Site")
            .axis(tickCount=5,labelAngle=-90,grid=True),
            #.scale(domain=[70, 602]),
        
        alt.Y('mean(binding_mean)')
            .axis(tickCount=3)
            .title('Mean binding'),
        alt.Row('selection',title=None),
        tooltip=['site'],
        strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
        color=alt.Color('region',sort=custom_order,title='Region',scale=alt.Scale(range=['#5778a4', '#e49444', '#d1615d', '#85b6b2'])),
        #strokeColor=alt.condition(variant_selector, alt.value('orange'), alt.value('black')),


    ).properties(height=150, width=800).add_params(variant_selector).resolve_scale(y='independent')
    

    return chart


binding_by_site = plot_affinity_by_site(df_binding_effect_concat)
binding_by_site.display()
if entry_binding_combined_corr_plot is not None:
    binding_by_site.save(binding_by_site_plot)

Make bar chart showing max binding score for each contact residue¶

In [31]:
def plot_affinity_by_contact_site(df, sites_to_show):
    # Filter the DataFrame based on the sites to show
    contact_df = df[df["site"].isin(sites_to_show)]
    
    # Define a selection for highlighting bars on hover
    variant_selector = alt.selection_point(on="mouseover", nearest=True, empty=False, fields=["site"])
    
    # Create the chart
    chart = alt.Chart(contact_df).mark_bar(stroke='black').encode(
        y=alt.Y("site:N", title="Site",sort='-x'),
        x=alt.X("max(binding_mean):Q", title="Max binding mutant at site"),
        color=alt.condition(variant_selector, alt.value("orange"), alt.value("black")),
        strokeWidth=alt.condition(variant_selector, alt.value(1), alt.value(0)),
        column=alt.Column('selection',title=None)
    ).add_params(variant_selector).properties(width=200, height=alt.Step(12)).resolve_scale(x='independent',y='shared').configure_header(
        labelFontSize=20,  
        labelAngle=0,
        labelAlign='center',
        labelAnchor='middle',
        labelFont='Helvetica Light',
        labelFontStyle='bold',
        labelPadding=3
    )
    
    return chart
In [32]:
contact_binding_by_site = plot_affinity_by_contact_site(df_binding_effect_concat, config["contact_sites"])
contact_binding_by_site.display()
if entry_binding_combined_corr_plot is not None:
    contact_binding_by_site.save(combined_contact_ranked_bar_output)

Make bubble plots for binding in different areas of receptor pocket¶

In [33]:
def make_bubble_binding_region(df):
    barrel_ranges = {
        "Stalk": list(range(70, 148)),
        "Neck": list(range(148, 166)),
        "Linker": list(range(166, 178)),
        "Head": list(range(178, 603)),
        "Receptor Contact": config["contact_sites"],
    }
    custom_order = ["Stalk", "Neck", "Linker", "Head", "Receptor Contact"]
    agg_means = []

    # For each barrel, filter the site_means dataframe to the sites belonging to that barrel and then store the means
    for barrel, sites in barrel_ranges.items():
        subset = df[df["site"].isin(sites)]
        for _, row in subset.iterrows():
            agg_means.append(
                {
                    "region": barrel,
                    "binding_mean": row["binding_mean"],
                    "site": row["site"],
                    "mutant": row["mutant"],
                    "selection": row["selection"],
                }
            )
        agg_means_df = pd.DataFrame(agg_means)

    variant_selector = alt.selection_point(
        on="mouseover", empty=False, nearest=True, fields=["site", "mutant"], value=1
    )

    chart = alt.Chart(agg_means_df).mark_point(stroke="black",filled=True).encode(
        x=alt.X(
            "region:O",
            sort=custom_order,
            title="RBP Region",
            axis=alt.Axis(labelAngle=-90),
        ),
        y=alt.Y(
            "binding_mean",
            title="Binding",
            axis=alt.Axis(tickCount=4),
        ),
        xOffset="random:Q",
        tooltip=["region", "binding_mean", "site", "mutant"],
        color=alt.condition(
            variant_selector, alt.value("orange"), alt.value("black")
        ),
        opacity=alt.condition(variant_selector, alt.value(1), alt.value(0.1)),
        strokeWidth=alt.condition(variant_selector, alt.value(2), alt.value(0)),
        size=alt.condition(variant_selector, alt.value(70), alt.value(20)),
        column=alt.Column('selection',title=None)
    ).transform_calculate(random="sqrt(-1*log(random()))*cos(2*PI*random())"
    ).properties(
        width=300, 
        height=300
    ).add_params(variant_selector
    ).resolve_scale(y='independent'
    ).configure_header(
        labelFontSize=20,  
        labelAngle=0,
        labelAlign='center',
        labelAnchor='middle',
        labelFont='Helvetica Light',
        labelFontStyle='bold',
        labelPadding=1
    )
    
    return chart


entry_region_bubble = make_bubble_binding_region(df_binding_effect_concat)
entry_region_bubble.display()
if entry_binding_combined_corr_plot is not None:
    entry_region_bubble.save(binding_region_bubble_plot)

Plot effects of selected mutants on cell entry¶

In [34]:
display(df_binding_effect_concat)
site wildtype mutant binding_mean binding_std times_seen_binding effect effect_std times_seen_cell_entry frac_models mutant_type selection
0 71 Q D -0.78170 1.646000 3.25 -1.1640 0.88900 4.500 1.0 negative bEFNB2
1 71 Q E 0.16590 0.430900 3.50 -1.2550 0.31230 5.375 1.0 negative bEFNB2
2 71 Q F -0.34290 0.529900 3.00 -1.0580 0.66370 4.625 1.0 aromatic bEFNB2
3 71 Q G 0.46570 0.704200 4.00 -1.4250 0.58780 7.875 1.0 special bEFNB2
4 71 Q H 0.02003 0.405500 2.50 -0.3764 0.36300 2.750 1.0 positive bEFNB2
... ... ... ... ... ... ... ... ... ... ... ... ...
6613 602 T R -0.04309 0.119100 5.00 0.4114 0.18240 6.286 1.0 positive bEFNB3
6614 602 T S 0.08546 0.005451 5.00 0.3778 0.15710 3.429 1.0 hydrophilic bEFNB3
6615 602 T V 0.20070 0.101500 6.00 0.4051 0.13370 5.429 1.0 hydrophobic bEFNB3
6616 602 T W 0.16630 0.134300 6.50 0.5205 0.09616 6.857 1.0 aromatic bEFNB3
6617 602 T Y 0.05261 0.016170 6.50 0.4106 0.23930 6.143 1.0 aromatic bEFNB3

13422 rows × 12 columns

In [35]:
def plot_effects_of_mutants(df):
    tmp_df = df.copy()
    tmp_df = tmp_df[
        #((tmp_df['site'] == 580) & (tmp_df['mutant'] == 'S')) |
        ((tmp_df['site'] == 598) & (tmp_df['mutant'] == 'G')) |
        #((tmp_df['site'] == 492) & (tmp_df['mutant'] == 'L')) |
        ((tmp_df['site'] == 553) & (tmp_df['mutant'] == 'W')) |
        #((tmp_df['site'] == 588) & (tmp_df['mutant'] == 'P')) |
        #((tmp_df['site'] == 492) & (tmp_df['mutant'] == 'R')) |
        #((tmp_df['site'] == 530) & (tmp_df['mutant'] == 'F')) |
        #((tmp_df['site'] == 492) & (tmp_df['mutant'] == 'K')) |
        #((tmp_df['site'] == 239) & (tmp_df['mutant'] == 'H')) |
        ((tmp_df['site'] == 211) & (tmp_df['mutant'] == 'F')) |
        ((tmp_df['site'] == 546) & (tmp_df['mutant'] == 'H')) |
        ((tmp_df['site'] == 143) & (tmp_df['mutant'] == 'Q')) |
        ((tmp_df['site'] == 331) & (tmp_df['mutant'] == 'E')) |
        ((tmp_df['site'] == 566) & (tmp_df['mutant'] == 'C')) 

        #((tmp_df['site'] == 589) & (tmp_df['mutant'] == 'G')) 
    ]
    tmp_df['label'] = (tmp_df['wildtype'].astype(str) + tmp_df['site'].astype(str) + tmp_df['mutant'].astype(str))

    tmp_df = tmp_df.sort_values(by='site')
    binding_chart = alt.Chart(tmp_df).mark_bar().encode(
        alt.X("label:N",title='Mutant',sort=None),
        alt.Y("binding_mean:Q",title='Binding score',axis=alt.Axis(tickCount=4)),
        #xOffset="selection:N",
        color=alt.Color("selection:N",title='Receptor selection'),
        row='selection'
    )
    binding_chart.resolve_scale(y='independent',x='shared').display()
plot_effects_of_mutants(df_binding_effect_concat)
In [ ]: